/*
Eric Steinbrook
Master Do-File
Guatemala CVD Risk Factor Prevalence
May 2022

5/11/22
 [x]add age standardized measurements of current alcohol use (currentetoh)
 [x]add non age standardized measurements of different HTN measures  
 [x]add different smoking measures
 [x]change logistic regression. We're only adjusting for sex, age, or sex and age. Our goal is to show prevalence, not to show associations.

5/16/22
 [x] generate new weights with the raking procedure to account for non-response by gender
 
 6/3/22
 [ ] add in a poverty line of 50% to the age and sex adjusted regressions
*/

*Data Cleaning
	
	clear all
	*load the original dataset
	use "/Users/eric/Dropbox (University of Michigan)/Flood/Guatemala_Diabetes/Data/CKDu_for_secondary_analysis_v12.dta"
	*Check for duplicate records
	duplicates list record_id, nolabel sepby(id) // record_id 701 is duplicated and needs to be deleted
	sort record_id //sort by record_id
	quietly by record_id: gen dup = cond(_N==1, 0,_n) //creates dup. Assigns 0 to unique variables, 1 for duplicate first occurence
	tabulate dup // only 1 duplicate.
	drop if dup > 1 // Drop the second copy of the duplicate
	duplicates report //check that duplicates are gone
	
*SVY weighting - adjusting the sampling weights for non-response bias
	/*
	
	In the KIR paper, Ann did not account for non-response bias of men in the overall
	results. Instead, she did stratified analyses using postweights. See her code 
	in "anl_prevalences.do."
	
	But given the sample skew so much toward women, I ***think*** it is best to
	make adjusted post-stratification weights for subsequent analyses. Peter agrees.
	
	The below numbers come from Ann:
	
	For sex post-estimation weighting. Numbers for poswgt come from estimated populations of Tecpan and SAS.
	Then assumed 62% of the population is over the age of 18 (CIA World Factbook 2016)
	And 50.7% is women. That is where the numbers below come from. This is the system you need to get the weighted estimates by site AND sex
		
	*women in SAS
	generate pstwgt=16345 if site==2 & sexo==2 
	
	*men in SAS
	replace pstwgt=15894 if site==2 & sexo==1
	
	*men in tecpan
	replace pstwgt=23249 if site==1 & sexo==1
	
	*women in tecpan
	replace pstwgt=26677 if site==1 & sexo==2
	tab pstwgt
	
	tecpan population = 49936 (60.8%)
	sas population = 32239 (39.2%)
	
	female population = 50.7%
	*/
	
	recode site 2=0, gen(tecpan)
	tab female tecpan
	
	*Download the IPFweight package
	ssc install ipfweight

	* See ipfweigh example here: https://bfschaffner.github.io/bookdown-stata/post-stratification-weights.html
		// We do this to adjust for the population parameters for sex and site
	ipfweight female tecpan, gen(pstwgt_w_ind_norm) val(49.3 50.7 39.2 60.8) maxit(25) st(w_ind_norm)
	label variable pstwgt_w_ind_norm "Final survey weight (adjusted to for population by sex and site)"
	
	browse record_id site hogar sexo  w_ind_norm pstwgt_w_ind_norm // can check out how weights were adjusted
	
	*Tell STATA that this is survey data
	svyset hogar [pw=pstwgt_w_ind_norm], strata(site)
	
	*Create subpop of male and female
	gen sample_male = 1 if sexo == 1
	gen sample_female = 1 if sexo == 2

/***************
Part 1: Table 1 (descriptive characteristics)
*/

	/*
	Compute 
		1. total observations, and total observations by sex
		2. the proportion (with 95% CI) of the population in each agecat
		3. same as 2, but stratified by sex*/
		
	*use egen and cut to create agecat, which group based on edad
	egen agecat = cut(edad), at(18,30,40,50,60,70,100) label // there are 4 observations missing age information

	*Verify that agecat was grouped correctly
	tabstat edad, by(agecat) stats(min max) //it was grouped correctly
	*Count the number total, male, and female
	tabstat record_id, by(sexo) stats(count) // 280 male, 526 female

	*estimate svy weighted proportions of agecat
	svy: proportion agecat, percent cformat (%9.1f) //whole group
			svy, subpop(sample_male): proportion agecat, percent cformat (%9.1f) // males only
			svy, subpop(sample_female): proportion agecat, percent cformat (%9.1f) //females only

	/*
		Task: combine university and secondary and make 3 level schoolcat*/
		

	*describe the format of the school level variable
	tab nivelescuela  // byte %34.0g
	tab escuela
	*look at values for nivelescuela
	svy: proportion nivelescuela, percent
	svy: proportion escuela, percent

	*generate new categories for education, and call the new variable schoolcat
	/*label define escuela_ 1 "Yes" 0 "No" 
	label define nivelescuela_ 1 "Preprimaria" 2 "Primaria" 3 "Basico" 4 "Diversificado" 5 "Universidad Tecnico" 6 "Universidad Licenciatura" 7 "Maestria o doctorado universitario" */
	*Define schoolcat = 0 if escuela = no or if nivelescuela = preprimary

			generate schoolcat = .
					replace schoolcat = 3 if (nivelescuela == 4 | nivelescuela == 5 | nivelescuela == 6 | nivelescuela == 7)
					replace schoolcat = 2 if (nivelescuela == 3 | nivelescuela == 2)
					replace schoolcat = 1 if (nivelescuela == 1 | escuela == 0)
					
	*Check that schoolcat was assigned correctly
	tabstat schoolcat nivelescuela, by(schoolcat) stats (count)

	*Label schoolcat variables		
			label define schoolcat_label 1 "No primary education" 2 "Primary school" 3 "Secondary school or higher"
			label values schoolcat schoolcat_label
			lab var schoolcat "Highest education achieved"
			
	*estimate svy weighted proportions of schoolcat:
				svy: proportion schoolcat, percent cformat (%9.1f)
				svy, subpop(sample_male): proportion schoolcat, percent cformat (%9.1f)
				svy, subpop (sample_female): proportion schoolcat, percent cformat (%9.1f)
		
	*Ethnicity
		*Combine xinca and ladino as non-indigenous
	gen ethn =.
		replace ethn = 2 if comoidentif==1
		replace ethn = 1 if comoidentif ==2 | comoidentif==4
		lab define ethn_label2 2 "Indigenous/Maya" 1 "Non-indigenous"
		lab values ethn ethn_label2
		
		svy: proportion ethn, percent cformat (%9.1f)
		svy, subpop(sample_male): proportion ethn, percent cformat (%9.1f)
		svy, subpop (sample_female): proportion ethn, percent cformat (%9.1f)
		
		
	*estimate svy weighted proportions for % below national poverty line
		*rawqpsscore is 0 to 100 score from a QPS survey (0 corresponds to poverty, 100 to a low poverty likelihood)
		*qps_prob_pobr_100 - probability of being below the national poverty line (0 is 0% chance of poverty, 1 is 100% chance of poverty)
		*qps_prob_comida - probability of being below the severe food poverty line
		*Peter explaining QPS: "the QPS gives a probability that a family is below the poverty line. what we've done in the paper is summed those probabilities and given a mean probability, which we then call a proportion in the vernacular sense, which i think is perhaps not completely mathematically true but justified as it is a rigorous random/representative sample. but this only works at the sample/population/summary level. At the individual level, we only have individual probabilities, which can't be converted to individual level binary values."
		
		svy: mean qps_prob_pobr_100	
		svy, subpop (sample_male): mean qps_prob_pobr_100
		svy, subpop (sample_female): mean qps_prob_pobr_100
	
	
/******************
Appendix table 1
Age Standardization
*/

	*Generate variables for Diabetes, HTN, BMIcat, and smoking
	
	*Diabetes**
	 *diabetes // a1c>=6.5 or self-report
	 *diabetesonlya1c

	*Confirmed hypertension, >=140/90 mmhg
	*Note:
	/* 
	I'm following the convention defined in NHANES, which eliminates the first blood pressure measurement (sist1 and diast1) to get more accurate measurements 
	We therefore will NOT be using the sistfinal and diastfinal variables that Ann created, because these didn't follow this convention
	For more info, see https://www.cdc.gov/nchs/data/nhanes/nhanes_11_12/Physician_Exam_Manual.pdf and https://jamanetwork.com/journals/jama/article-abstract/185953
	Note: all SBP readings less than 70 mm Hg and DBP < 40 and DBP >120 have already been deleted due to high likelihood of spurious results.*/
	*average the second and third blood pressure measurements
	egen sist_avg=rmean(sist2 sist3)
	egen diast_avg=rmean(diast2 diast3)
	lab var sist_avg "Mean systolic BP"
	lab var diast_avg "Mean diastolic BP"

	*Hypertension >=140/90
	gen htn140=.
	replace htn140=0 if sist_avg <140 & diast_avg <90
	replace htn140=1 if sist_avg >= 140 | diast_avg >= 90
	lab var htn140 "hypertension from BP 140/90"

	
	*Hypertension >=140/90 or self-report
	*Note:
	/* 
	presionsujet is a survey question asking if patients have ever been diagnosed with hypertension. 
	1=yes, 2=no, 3=don't know, 4=I don't know what high blood pressure is */
	gen hypertensive140clinical=.
	replace hypertensive140clinical=0 if sist_avg < 140 & diast_avg < 90 & presionsujet != 1
	replace hypertensive140clinical=1 if sist_avg >= 140 | diast_avg >= 90 | presionsujet == 1
	replace hypertensive140clinical=. if sist_avg == . | diast_avg == . & presionsujet != 1
	lab var hypertensive140clinical "hypertension from BP 140/90 or patient report"

	*Hypertension, >=130/80 or self-report
	gen hypertensive130clinical=.
	replace hypertensive130clinical=0 if sist_avg < 130 & diast_avg < 80 & presionsujet != 1
	replace hypertensive130clinical=1 if sist_avg >= 130 | diast_avg >= 80 | presionsujet == 1
	replace hypertensive130clinical=. if sist_avg == . | diast_avg == . & presionsujet != 1
	lab var hypertensive130clinical "hypertension from BP 130/80 or patient report"
		
	*Hypertension >=130/80
	gen htn130=.
	replace htn130=0 if sist_avg <130 & diast_avg <80
	replace htn130=1 if sist_avg >= 130 | diast_avg >= 80
	lab var htn130 "hypertension from BP 130/80"


	*BMI/obesity
	*Notes:
	/*
	1. BMI calculated from mean of 3 seperate measurements of height (tallafinal) and weight (pesofinal).
	2. imc is a continuous variable of the BMI. It's calculated by: pesofinalkg / ((tallafinal/100)^2) 
	3. imccat lumps all categories of obesity together, so I'm generating new variable that brings out those categories
	*/
	gen bmicat=.
	replace bmicat=0 if imc<18.5
	replace bmicat=1 if imc>=18.5 & imc<25
	replace bmicat=2 if imc>=25 & imc<30
	replace bmicat=3 if imc>=30
	replace bmicat=. if imc==. 

	label define bmilabel 0 "BMI underweight" 1 "BMI normal." 2 "BMI overweight" 3 "BMI Obese"
	label values bmicat bmilabel

	gen obesity=.
		replace obesity = 1 if imc>=30
		replace obesity = 0 if imc<=30
		replace obesity =. if imc==.
	label define obes_label 1 "obese(BMI>=30)" 2 "not obese (BMI<=30)"
	label values obesity obes_label

	*Smoking
	*Notes:
	/*
	1. fumar is defined as: 1: daily smoker 2: less than daily 3: never smoker
	2. fumarantes is defined as having smoked before, regardless of whether the person is a current smoker
	3. cuantoscigarros is cigs smoked per day and cuantosenlasemana is cigs smoked per day for those who 
	answered yest to fumarantes
	4. There are different values for cuantoscigarros and cuantosenlasemana
	*/

	/*explore whether there's overlap between fumar and fumarantes
	gen fum_test=.
		replace fum_test = 0 if fumar > 2 | fumarantes >2 //non-smokers
		replace fum_test = 1 if fumar >2 | fumarantes <= 2 //former-smokers
		replace fum_test = 2 if fumar <=2 // current-smokers
		replace fum_test = 3 if fumar < 3 & fumarantes < 3 //current smoker and former smoker
		replace fum_test = . if fumar == . | fumarantes == .
		
		label define fum_label 0 "non-smokers" 1 "former-smokers" 2 "current-smokers" 3 "current and former smokers"
		label values fum_test fum_label
		
	tabstat record_id, by(fum_test) stats (count) //735 former-smokers, 11 current smokers, and 55 current and former smokers*/

	*Current smokers
	gen current_smoker=.
		replace current_smoker = 0 if fumar >2 //non-smokers
		replace current_smoker = 1 if fumar <= 2 //current-smokers
		replace current_smoker = . if fumar == . //missing
		
		label define current_smoker_label 0 "non-smokers" 1 "current-smokers"
		label values current_smoker current_smoker_label
		
	*Lifetime smokers**
		gen tobacco_use=.
			replace tobacco_use=2 if fumar<=2|fumarantes<=2
			replace tobacco_use=1 if fumar==3&fumarantes==3
			replace tobacco_use=. if current_smoker==.
		label define tobacco_label 2 "any lifetime tobacco use" 1 "never smoker"
		label values tobacco_use tobacco_label


	

	*Prevalence single morbidity
	svy: proportion diabetesonlya1c, percent cformat (%9.1f)
	svy: proportion diabetes, percent cformat (%9.1f)
	svy: proportion htn140, percent cformat (%9.1f)
	svy: proportion hypertensive140clinical, percent cformat (%9.1f)
	svy: proportion htn130, percent cformat (%9.1f)
	svy: proportion hypertensive130clinical, percent cformat (%9.1f)
	svy: proportion bmicat, percent cformat (%9.1f)
	svy: proportion current_smoker, percent cformat (%9.1f)
	svy: proportion tobacco_use, percent cformat(%9.1f)
	

	*Prevalence in males
	svy, subpop(sample_male): proportion diabetesonlya1c, percent cformat (%9.1f)	
	svy, subpop(sample_male): proportion diabetes, percent cformat (%9.1f)
	svy, subpop(sample_male): proportion htn140, percent cformat (%9.1f)
	svy, subpop(sample_male): proportion hypertensive140clinical, percent cformat (%9.1f)
	svy, subpop(sample_male): proportion htn130, percent cformat (%9.1f)
	svy, subpop(sample_male): proportion hypertensive130clinical, percent cformat (%9.1f)
	svy, subpop(sample_male): proportion bmicat, percent cformat (%9.1f)
	svy, subpop(sample_male): proportion current_smoker, percent cformat (%9.1f)
	svy, subpop(sample_male): proportion tobacco_use, percent cformat(%9.1f)

	*Prevalence in females
	svy, subpop(sample_female): proportion diabetesonlya1c, percent cformat (%9.1f)
	svy, subpop(sample_female): proportion diabetes, percent cformat (%9.1f)
	svy, subpop(sample_female): proportion htn140, percent cformat (%9.1f)
	svy, subpop(sample_female): proportion hypertensive140clinical, percent cformat (%9.1f)
	svy, subpop(sample_female): proportion htn130, percent cformat (%9.1f)
	svy, subpop(sample_female): proportion hypertensive130clinical, percent cformat (%9.1f)
	svy, subpop(sample_female): proportion bmicat, percent cformat (%9.1f)
	svy, subpop(sample_female): proportion current_smoker, percent cformat (%9.1f)
	svy, subpop(sample_female): proportion tobacco_use, percent cformat(%9.1f)


	**Prevalence of two comorbidities
	gen dm_htn=.
		replace dm_htn = 1 if diabetes==1 & hypertensive140clinical==1
		replace dm_htn = 0 if diabetes!=1 | hypertensive140clinical!=1
		replace dm_htn = . if diabetes==. | hypertensive140clinical==.
		lab var dm_htn "diabetes and hypertension (130/80)"

	gen dm_obesity=.
		replace dm_obesity = 1 if diabetes==1 & bmicat==3
		replace dm_obesity = 0 if diabetes!=1 | bmicat!=3
		replace dm_obesity = . if diabetes==. | bmicat==.
		lab var dm_obesity "diabetes and obesity"
		
	gen dm_smoking=.
		replace dm_smoking = 1 if diabetes==1 & current_smoker==1
		replace dm_smoking = 0 if diabetes!=1 | current_smoker!=1
		replace dm_smoking = . if diabetes==. | current_smoker==.
		lab var dm_smoking "diabetes and smoking"
		
	gen htn_obesity=.
		replace htn_obesity = 1 if hypertensive140clinical==1 & bmicat==3
		replace htn_obesity = 0 if hypertensive140clinical!=1 | bmicat!=3
		replace htn_obesity = . if  hypertensive140clinical==. | bmicat==.
		lab var htn_obesity "Hypertension (130/80) and obesity"

	gen htn_smoking=.
		replace htn_smoking = 1 if hypertensive140clinical==1 & current_smoker==1
		replace htn_smoking = 0 if hypertensive140clinical!=1 | current_smoker!=1
		replace htn_smoking = . if hypertensive140clinical==. | current_smoker==.
		lab var htn_smoking "Hypertension (130/80) and current smoker"
		
	gen obesity_smoking=.
		replace obesity_smoking=1 if bmicat==3 & current_smoker==1
		replace obesity_smoking=0 if bmicat!=3 | current_smoker!=1
		replace obesity_smoking=. if bmicat==. | current_smoker==.
		lab var obesity_smoking "Obesity and current smoker"

	gen etoh_dm=. 
		replace etoh_dm = 1 if currentetoh==3 & diabetes==1
		replace etoh_dm = 0 if currentetoh!=3 | diabetes!=1
		replace etoh_dm = . if currentetoh==. | diabetes==.
		lab var etoh_dm "excessive alcohol and diabetes"

	gen etoh_htn=.
		replace etoh_htn = 1 if currentetoh==3 & hypertensive140clinical==1
		replace etoh_htn = 0 if currentetoh!=3 | hypertensive140clinical!=1
		replace etoh_htn = . if currentetoh==. | hypertensive140clinical==.
		lab var etoh_htn "excessive alcohol and hypertension 140/90"
		
	gen etoh_obesity=.
		replace etoh_obesity = 1 if currentetoh==3 & bmicat==3
		replace etoh_obesity = 0 if currentetoh!=3 | bmicat!=3
		replace etoh_obesity = . if currentetoh==. | bmicat==.
		lab var etoh_obesity "excessive alcohol and obesity"

	gen etoh_smoking=.
		replace etoh_smoking = 1 if currentetoh==3 & current_smoker==1
		replace etoh_smoking = 0 if currentetoh!=3 | current_smoker!=1
		replace etoh_smoking = . if currentetoh==. | current_smoker==.
		lab var etoh_smoking "excessive alcohol and smoking"
		
	svy: tabulate dm_htn, count
	svy: tabulate dm_obesity, count
	svy: tabulate dm_smoking, count
	svy: tabulate htn_obesity, count
	svy: tabulate htn_smoking, count
	svy: tabulate obesity_smoking, count

	*Prevalence in males
	svy, subpop(sample_male): tabulate dm_htn, count
	svy, subpop(sample_male): tabulate dm_obesity, count
	svy, subpop(sample_male): tabulate dm_smoking, count
	svy, subpop(sample_male): tabulate htn_obesity, count
	svy, subpop(sample_male): tabulate htn_smoking, count
	svy, subpop(sample_male): tabulate obesity_smoking, count

	*Prevalence in females
	svy, subpop(sample_female): tabulate dm_htn, count
	svy, subpop(sample_female): tabulate dm_obesity, count
	svy, subpop(sample_female): tabulate dm_smoking, count
	svy, subpop(sample_female): tabulate htn_obesity, count
	svy, subpop(sample_female): tabulate htn_smoking, count
	svy, subpop(sample_female): tabulate obesity_smoking, count

	**Prevalence of 3 or more comorbidities
	gen t2dm_htn_obesity=.
		replace t2dm_htn_obesity=1 if diabetes==1 & hypertensive140clinical==1 & bmicat==3
		replace t2dm_htn_obesity=0 if diabetes!=1 | hypertensive140clinical!=1 | bmicat!=3
		replace t2dm_htn_obesity=. if diabetes==. | hypertensive140clinical==. | bmicat==.
		lab var t2dm_htn_obesity "Diabetes, Hypertension, and Obesity"


	gen t2dm_htn_smok=.
		replace t2dm_htn_smok=1 if diabetes==1 & hypertensive140clinical==1 & current_smoker==1
		replace t2dm_htn_smok=0 if diabetes!=1 | hypertensive140clinical!=1 | current_smoker!=1
		replace t2dm_htn_smok=. if diabetes==. | hypertensive140clinical==. | current_smoker==.
		lab var t2dm_htn_smok "Diabetes, Hypertension, Smoking"

	gen t2dm_obesity_smok=.
		replace t2dm_obesity_smok=1 if diabetes==1 & bmicat==3 & current_smoker==1
		replace t2dm_obesity_smok=0 if diabetes!=1 | bmicat!=3 | current_smoker!=1
		replace t2dm_obesity_smok=. if diabetes==. | bmicat==. | current_smoker==.
		lab var t2dm_obesity_smok "Diabetes, obesity, and smoking"

	gen htn_obes_smok=.
		replace htn_obes_smok=1 if hypertensive140clinical==1 & bmicat==3 & current_smoker==1
		replace htn_obes_smok=0 if hypertensive140clinical!=1 | bmicat!=3 | current_smoker!=1
		replace htn_obes_smok=. if hypertensive140clinical==. | bmicat==. | current_smoker==.
		lab var htn_obes_smok "Hypertension, Obesity, and smoking"
		
	gen etoh_t2dm_htn=.
		replace etoh_t2dm_htn=1 if currentetoh==3 & diabetes==1 & hypertensive140clinical==1
		replace etoh_t2dm_htn=0 if currentetoh!=3 | diabetes!=1 & hypertensive140clinical!=1
		replace etoh_t2dm_htn=. if currentetoh==. | diabetes==. | hypertensive140clinical==.
		lab var etoh_t2dm_htn "Excessive alcohol, diabetes, and hypertension"
		
	gen etoh_t2dm_obesity=.
		replace etoh_t2dm_htn=1 if currentetoh==3 & diabetes==1 & bmicat==3
		replace etoh_t2dm_htn=0 if currentetoh!=3 | diabetes!=1 & bmicat!=3
		replace etoh_t2dm_htn=. if currentetoh==. | diabetes==. | bmicat==.
		lab var etoh_t2dm_htn "Excessive alcohol, diabetes, and obesity"
		
	gen etoh_t2dm_smok=.
		replace etoh_t2dm_htn=1 if currentetoh==3 & diabetes==1 & current_smoker==1
		replace etoh_t2dm_htn=0 if currentetoh!=3 | diabetes!=1 & current_smoker!=1
		replace etoh_t2dm_htn=. if currentetoh==. | diabetes==. | current_smoker==.
		lab var etoh_t2dm_htn "Excessive alcohol, diabetes, and obesity"
		
	gen etoh_htn_smok=.
		replace etoh_htn_smok=1 if currentetoh==3 & hypertensive140clinical==1 & current_smoker==1
		replace etoh_htn_smok=0 if currentetoh!=3 | hypertensive140clinical!=1 | current_smoker!=1
		replace etoh_htn_smok=. if currentetoh==. | hypertensive140clinical==. | current_smoker==.
		lab var etoh_htn_smok "Excessive alcohol, Hypertension, and tobacco use"

	gen etoh_htn_obesity=.
		replace etoh_htn_smok=1 if currentetoh==3 & hypertensive140clinical==1 & bmicat==3
		replace etoh_htn_smok=0 if currentetoh!=3 | hypertensive140clinical!=1 | bmicat!=3
		replace etoh_htn_smok=. if currentetoh==. | hypertensive140clinical==. | bmicat==.
		lab var etoh_htn_smok "Excessive alcohol, Hypertension, and obesity"

	gen etoh_obesity_smok=.
		replace etoh_obesity_smok=1 if currentetoh==3 & bmicat==3 & current_smoker==1
		replace etoh_obesity_smok=0 if currentetoh!=3 | bmicat!=3 | current_smoker!=1
		replace etoh_obesity_smok=. if currentetoh==. | bmicat==. | current_smoker==.
		lab var etoh_obesity_smok "Excessive alcohol, Obesity, and smoking"
		
	*Prevalence of 4 or more comorbidities
	gen t2dm_htn_obesity_smok=.
		replace t2dm_htn_obesity_smok=1 if diabetes==1 & hypertensive140clinical==1 & bmicat==3 & current_smoker==1
		replace t2dm_htn_obesity_smok=0 if diabetes!=1 | hypertensive140clinical!=1 | bmicat!=3 | current_smoker!=1
		replace t2dm_htn_obesity_smok=. if diabetes==. | hypertensive140clinical==. | bmicat==. | current_smoker==.
		lab var t2dm_htn_obesity_smok "Diabetes, Hypertension, obesity, and smoking"

	gen t2dm_htn_obesity_etoh=.
		replace t2dm_htn_obesity_etoh=1 if diabetes==1 & hypertensive140clinical==1 & bmicat==3 & currentetoh==3
		replace t2dm_htn_obesity_etoh=0 if diabetes!=1 | hypertensive140clinical!=1 | bmicat!=3 | currentetoh!=3
		replace t2dm_htn_obesity_etoh=. if diabetes==. | hypertensive140clinical==. | bmicat==. | currentetoh==.
		lab var t2dm_htn_obesity_etoh "Diabetes, Hypertension, obesity, and excessive alcohol"

	gen t2dm_htn_etoh_smok=.
		replace t2dm_htn_etoh_smok=1 if diabetes==1 & hypertensive140clinical==1 & current_smoker==1 & currentetoh==3
		replace t2dm_htn_etoh_smok=0 if diabetes!=1 | hypertensive140clinical!=1 | current_smoker!=1 | currentetoh!=3
		replace t2dm_htn_etoh_smok=. if diabetes==. | hypertensive140clinical==. | current_smoker==. | currentetoh==.
		lab var t2dm_htn_etoh_smok "Diabetes, Hypertension, excessive alcohol, and current smoker"

	gen t2dm_etoh_obesity_smok=.
		replace t2dm_htn_etoh_smok=1 if diabetes==1 & bmicat==3 & current_smoker==1 & currentetoh==3
		replace t2dm_htn_etoh_smok=0 if diabetes!=1 | bmicat!=3 | current_smoker!=1 | currentetoh!=3
		replace t2dm_htn_etoh_smok=. if diabetes==. | bmicat==. | current_smoker==. | currentetoh==.
		lab var t2dm_htn_etoh_smok "Diabetes, excessive alcohol, obesity, and current smoker"

	gen htn_etoh_obesity_smok=.
		replace htn_etoh_obesity_smok=1 if hypertensive140clinical==1 & bmicat==3 & current_smoker==1 & currentetoh==3
		replace htn_etoh_obesity_smok=0 if hypertensive140clinical!=1 | bmicat!=3 | current_smoker!=1 | currentetoh!=3
		replace htn_etoh_obesity_smok=. if hypertensive140clinical==. | bmicat==. | current_smoker==. | currentetoh==.
		lab var htn_etoh_obesity_smok "Hypertension, excessive alcohol, obesity, and current smoker"

		table t2dm_htn_obesity_smok
		table t2dm_htn_obesity_etoh
		table t2dm_htn_etoh_smok
		*table t2dm_etoh_obesity_smok no observations
		table htn_etoh_obesity_smok

		
	*prevalence of 5 comorbidities
	gen t2dm_htn_obesity_smok_etoh=.
		replace t2dm_htn_obesity_smok_etoh=1 if diabetes==1 & hypertensive140clinical==1 & bmicat==3 & current_smoker==1 & currentetoh==3
		replace t2dm_htn_obesity_smok_etoh=0 if diabetes!=1 | hypertensive140clinical!=1 | bmicat!=3 | current_smoker!=1 | currentetoh!=3
		replace t2dm_htn_obesity_smok_etoh=. if diabetes==. | hypertensive140clinical==. | bmicat==. | current_smoker==. | currentetoh==.
		lab var t2dm_htn_obesity_smok_etoh "Diabetes, Hypertension, obesity, smoking, and excessive alcohol"
		
	svy: tabulate t2dm_htn_obesity, count
	svy: tabulate t2dm_htn_smok, count
	svy: tabulate t2dm_obesity_smok, count
	svy: tabulate htn_obes_smok, count
	svy: tabulate t2dm_htn_obesity_smok, count

	*prevalence in males
	svy, subpop(sample_male): tabulate t2dm_htn_obesity, count 
	svy, subpop(sample_male): tabulate t2dm_htn_smok, count
	svy, subpop(sample_male): tabulate t2dm_obesity_smok, count
	svy, subpop(sample_male): tabulate htn_obes_smok, count
	svy, subpop(sample_male): tabulate t2dm_htn_obesity_smok, count

	*prevalence in females
	svy, subpop(sample_female): tabulate t2dm_htn_obesity, count 
	svy, subpop(sample_female): tabulate t2dm_htn_smok, count
	svy, subpop(sample_female): tabulate t2dm_obesity_smok, count
	svy, subpop(sample_female): tabulate htn_obes_smok, count
	svy, subpop(sample_female): tabulate t2dm_htn_obesity_smok, count

	****************
	*Proportions
	svy: proportion diabetes, percent
	svy: proportion hypertensive140clinical, percent
	svy: proportion bmicat, percent
	svy: proportion current_smoker, percent

	svy: proportion dm_htn, percent
	svy: proportion dm_obesity, percent
	svy: proportion dm_smoking, percent
	svy: proportion htn_obesity, percent
	svy: proportion htn_smoking, percent
	svy: proportion obesity_smoking, percent

	svy: proportion t2dm_htn_obesity, percent
	svy: proportion t2dm_htn_smok, percent
	svy: proportion t2dm_obesity_smok, percent
	svy: proportion htn_obes_smok, percent
	svy: proportion t2dm_htn_obesity_smok, percent

	*Prevalence in males
	svy, subpop(sample_male): proportion diabetes, percent
	svy, subpop(sample_male): proportion hypertensive140clinical, percent
	svy, subpop(sample_male): proportion bmicat, percent
	svy, subpop(sample_male): proportion current_smoker, percent

	svy, subpop(sample_male): proportion dm_htn, percent
	svy, subpop(sample_male): proportion dm_obesity, percent
	svy, subpop(sample_male): proportion dm_smoking, percent
	svy, subpop(sample_male): proportion htn_obesity, percent
	svy, subpop(sample_male): proportion htn_smoking, percent
	svy, subpop(sample_male): proportion obesity_smoking, percent

	svy, subpop(sample_male): proportion t2dm_htn_obesity, percent 
	svy, subpop(sample_male): proportion t2dm_htn_smok, percent
	svy, subpop(sample_male): proportion t2dm_obesity_smok, percent
	svy, subpop(sample_male): proportion htn_obes_smok, percent
	svy, subpop(sample_male): proportion t2dm_htn_obesity_smok, percent


	*Prevalence in females
	svy, subpop(sample_female): proportion diabetes, percent 
	svy, subpop(sample_female): proportion hypertensive140clinical, percent
	svy, subpop(sample_female): proportion bmicat, percent
	svy, subpop(sample_female): proportion current_smoker, percent

	svy, subpop(sample_female): proportion dm_htn, percent
	svy, subpop(sample_female): proportion dm_obesity, percent
	svy, subpop(sample_female): proportion dm_smoking, percent
	svy, subpop(sample_female): proportion htn_obesity, percent
	svy, subpop(sample_female): proportion htn_smoking, percent
	svy, subpop(sample_female): proportion obesity_smoking, percent

	svy, subpop(sample_female): proportion t2dm_htn_obesity, percent 
	svy, subpop(sample_female): proportion t2dm_htn_smok, percent
	svy, subpop(sample_female): proportion t2dm_obesity_smok, percent
	svy, subpop(sample_female): proportion htn_obes_smok, percent
	svy, subpop(sample_female): proportion t2dm_htn_obesity_smok, percent

	*obesity prevalence by age, stratified by gender
	*Prepare age variables

	svy: proportion obesity agecat
	svy, subpop(sample_male): proportion obesity agecat
	svy, subpop(sample_female): proportion obesity agecat
	
	******************
	*Age Standardized estimates
			*Age standardized estimates to the WHO world population estimates
			*1. Prepare age groups
		gen age_group = ""
			replace age_group = "0-4" if inrange(edad,0,4)
			replace age_group = "5-9" if inrange(edad,5,9)
			replace age_group = "10-14" if inrange(edad,10,14)
			replace age_group = "15-19" if inrange(edad,15,19)
			replace age_group = "20-24" if inrange(edad,20,24)
			replace age_group = "25-29" if inrange(edad,25,29)
			replace age_group = "30-34" if inrange(edad,30,34)
			replace age_group = "35-39" if inrange(edad,35,39)
			replace age_group = "40-44" if inrange(edad,40,44)
			replace age_group = "45-49" if inrange(edad,45,49)
			replace age_group = "50-54" if inrange(edad,50,54)
			replace age_group = "55-59" if inrange(edad,55,59)
			replace age_group = "60-64" if inrange(edad,60,64)
			replace age_group = "65-69" if inrange(edad,65,69)
			replace age_group = "70-74" if inrange(edad,70,74)
			replace age_group = "75-79" if inrange(edad,75,79)
			replace age_group = "80-84" if inrange(edad,80,84)
			replace age_group = "85-89" if inrange(edad,85,89)
			replace age_group = "90-94" if inrange(edad,90,94)
			replace age_group = "95-99" if inrange(edad,95,99)
			replace age_group = "100+" if edad>100
			
			*2. merge WHO standard population data with working database, by age_group
				*delete the _merge variable, which is leftover from a previous merge
		drop _merge
		merge m:1 age_group using "/Users/eric/Dropbox (University of Michigan)/Flood_Diabetes/Guatemala_Diabetes/Code Examples/David_Template_do_files/Age_Standardization/WHO_age_standardization.dta"

			*3. Perform age standardization. age_group and WHOWorldStandard come from the
			* WHO world standard population: https://seer.cancer.gov/stdpopulations/world.who.html

		*************************
		*lab-confirmed diabetes (1 is diabetic, 0 is not)
		svy: proportion diabetesonlya1c, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_male): proportion diabetesonlya1c, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_female): proportion diabetesonlya1c, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)

		*lab-confirmed and self-reported diabetes (1 is diabetic, 0 is not)
		svy: proportion diabetes, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_male): proportion diabetes, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_female): proportion diabetes, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)

		*Confirmed hypertension, >=140/90 mmhg or self-report

		svy: proportion hypertensive140clinical, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_male): proportion hypertensive140clinical, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_female): proportion hypertensive140clinical, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)

		*Hypertension, >=130/80 or self-report

		svy: proportion hypertensive130clinical, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_male): proportion hypertensive130clinical, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_female): proportion hypertensive130clinical, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)

		*BMI

		svy: proportion bmicat, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_male): proportion bmicat, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_female): proportion bmicat, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)

		*Current smokers

		svy: proportion current_smoker, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_male): proportion current_smoker, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_female): proportion current_smoker, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		
		*Excess Alcohol
		svy: proportion currentetoh, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_male): proportion currentetoh, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		svy, subpop(sample_female): proportion currentetoh, percent stdize(age_group) stdweight(WHOWorldStandard) cformat(%9.1f)
		

/*************
Part 3: svy weighted proportions of CVD risk Factors
Figure 1
See 050522_barplots.R for plot code
*/

**1. Diabetes**

	*PURE definition: "Diabetes was defined as either a fasting glucose > 7 mmol/dl or
		*self-reported history of diabetes, on treatment for diabetes."
	
	*Our definition: Diabetes was defined as either a glycosylate hemoglobin >= 6.5% or a self-reported history of diabetes
		
	*Creating the variable	
		*the variable defined above is already in our dataset, and is called "diabetes"

	svy: proportion diabetes, percent cformat(%9.1f)
	
	*subset by male
	svy, subpop(sample_male): proportion diabetes, percent cformat(%9.1f)
	
	*subset by female
	svy, subpop(sample_female): proportion diabetes, percent cformat(%9.1f)
	
	svy: proportion diabetes, percent cformat(%9.6f)
	
	*subset by male
	svy, subpop(sample_male): proportion diabetes, percent cformat(%9.6f)
	
	*subset by female
	svy, subpop(sample_female): proportion diabetes, percent cformat(%9.6f)

**2. Low Education**

	*PURE definition: "Education was self-reported, and classified as low (primary education level or less), 
		*intermediate (secondary school education) or high (college, trade school, or university education)"
		
	*Our definition: "Education was self-reported, and classified as low (primary education level or less), 
		*intermediate (secondary school education) or high (college, trade school, or university education)
		
	*Creating the education variable
		*Use nivelescuela
			*Nivel escuela definitions 1 "Preprimaria" 2 "Primaria" 3 "Basico" 4 "Diversificado" 5 "Universidad Tecnico" 6 "Universidad Licenciatura" 7 "Maestria o doctorado universitario"
		*Call the new variable educat
	
	generate educat = .
		replace educat = 3 if (nivelescuela ==7 | nivelescuela == 6 | nivelescuela == 5)
		replace educat = 2 if (nivelescuela == 4)
		replace educat = 1 if (nivelescuela == 3 | nivelescuela == 2 | nivelescuela == 1)
		replace educat = . if nivelescuela == .
	label define educat_label 3 "high (college, trade school, or university education)" 2 "intermediate (secondary school education)" 1 "low (primary education or less)"
	label values educat educat_label
	lab var educat "highest education level achieved"
	
	svy: proportion educat, percent cformat(%9.1f)
	
	*subset by male
	svy, subpop(sample_male): proportion educat, percent cformat(%9.1f)
	
	*subset by female
	svy, subpop(sample_female): proportion educat, percent cformat(%9.1f)
	
**3. Alcohol**

	*PURE definition: "Self-reported alcohol consumption using a standard alcohol consumption frequency 
		*questionnaire. Consumption was categorized as former, never and current. Current consumption is
		*further categorized as low (<=7 drinks/week), moderate (8-14drinks/week in women or 8-21 drinks/week in men), 
		*or highconsumption (> 14 drinks/week in women or >21 drinks/week in men)."
			*Excess alcohol use defined as either high current use or former use
			*Non-excess use defined as no history of alcohol consumption, low current use, or moderate current use.
			
	*Our definition: "Self-reported alcohol consumption through a standard alcohol consumption frequency questionnaire. Consumption
		*was categorized as former, never and current." Current consumption was further categorized in terms of frequency as none, monthly or less, 1-2 days per week, or more than 1-2 days per week. Excess alcohol consumption was defined as consuming alcohol more than 2 days per week.
		
	*Creating the alcohol variables
		*Create a new variable called alcohol_consumption with categories for former drinkers, never drinkers, and current drinkers
		
	generate alcohol_consumption = .
		replace alcohol_consumption = 3 if (alcohol == 0 & alcoholever == 1)
		replace alcohol_consumption = 2 if (alcohol == 0 & alcoholever == 0)
		replace alcohol_consumption = 1 if alcohol ==1
		replace alcohol_consumption =. if alcohol ==.
	label define alc_label 3 "former drinker (no drinks in the last 12 months)" 2 "never drinker" 1 "current drinker (at least one drink in last 12 months)"
	label values alcohol_consumption alc_label
	lab var alcohol_consumption "Current, former, or never user of Alcohol"
		
		*currentetoh already defines the different categories for daily drinkers in terms of frequency
	
	svy: proportion alcohol_consumption, percent cformat(%9.6f)
	svy: proportion currentetoh, percent cformat(%9.6f)
	
	*subset by male
	svy, subpop(sample_male): proportion currentetoh, percent cformat(%9.6f)
	
	*subset by female
	svy, subpop(sample_female): proportion currentetoh, percent cformat(%9.6f)

**4. Tobacco use**

	*PURE definition: "Self-reported tobacco consumption using a standard tobacco use frequency 
		*questionnaire, categorized as never, former or current.
			*1 = history of current or former tobacco use**
			*0 = no history of tobacco use
			
	*Our definition: "Self-reported tobacco consumption using a standard tobacco use frequency 
		*questionnaire, categorized as never, former or current.
		
	*Creating the tobacco variables
		*"fumar" definition: 1 "Si fumo diariamente" 2 "Si fumo, pero no a diario" 3 "No, no fumo para nada"
		*"fumarantes" definition: 1 "Si, antes fumaba diariamente" 2 "Si antes fumaba, pero no a diario" 3 "No, antes no fumaba para nada"
		
			/*gen current_smoker=.
			replace current_smoker = 0 if fumar >2 //non-smokers
			replace current_smoker = 1 if fumar <= 2 //current-smokers
			replace current_smoker = . if fumar == . //missing
			
			label define current_smoker_label 0 "non-smokers" 1 "current-smokers"
			label values current_smoker current_smoker_label*/
			
			**Current tobacco use**
			svy: proportion current_smoker, percent cformat(%9.6f)
			svy, subpop(sample_male): proportion current_smoker, percent cformat(%9.6f)
			svy, subpop(sample_female): proportion current_smoker, percent cformat(%9.6f)

	
	
	**Any lifetime tobacco use**
	svy: proportion tobacco_use, percent cformat(%9.1f)
	*subset by male
	svy, subpop(sample_male): proportion tobacco_use, percent cformat(%9.1f)
	
	*subset by female
	svy, subpop(sample_female): proportion tobacco_use, percent cformat(%9.1f)
					
**5. Abdominal obesity**
	
	*PURE defintion: "Waist and hip circumference was measured routinely in participants at baseline, and used to calculate the waist to hip ratio (WHR)"
		*Obesity: "WHR > 0.9 in men or 0.85 in women"
		*Non-obses: "WHR < 0.9 in men or 0.85 in women"
		
	*Our definition: BMI>=30
	
	/*gen obesity=.
		replace obesity = 1 if imc>=30
		replace obesity = 0 if imc<=30
		replace obesity =. if imc==.
	label define obes_label 1 "obese(BMI>=30)" 2 "not obese (BMI<=30)"
	label values obesity obes_label*/
	
	svy: proportion obesity, percent cformat(%9.6f)
	*subset by male
	svy, subpop(sample_male): proportion obesity, percent cformat(%9.6f)
	
	*subset by female
	svy, subpop(sample_female): proportion obesity, percent cformat(%9.6f)
	
**6. Hypertension**

	*PURE definition: >= 140mmHg, self-report of HTN, or Tx with anti-hypertensive medications
	*Our definition: 
		*hypertensive140clinical (does not include treatment with anti-hypertensives)
		
	svy: proportion hypertensive140clinical, percent cformat(%9.6f)
		*subset by male
	svy, subpop(sample_male): proportion hypertensive140clinical, percent cformat(%9.6f)
	
	*subset by female
	svy, subpop(sample_female): proportion hypertensive140clinical, percent cformat(%9.6f)
	
*Total population table
	svy: proportion diabetes, percent cformat(%9.6f)
	svy: proportion currentetoh, percent cformat(%9.6f)
	svy: proportion current_smoker, percent cformat(%9.6f)
	svy: proportion obesity, percent cformat(%9.6f)
	svy: proportion hypertensive140clinical, percent cformat(%9.6f)

*Male table
	svy, subpop(sample_male): proportion diabetes, percent cformat(%9.6f)
	svy, subpop(sample_male): proportion currentetoh, percent cformat(%9.6f)
	svy, subpop(sample_male): proportion current_smoker, percent cformat(%9.6f)
	svy, subpop(sample_male): proportion obesity, percent cformat(%9.6f)
	svy, subpop(sample_male): proportion hypertensive140clinical, percent cformat(%9.6f)
	
*female table
	svy, subpop(sample_female): proportion diabetes, percent cformat(%9.6f)
	svy, subpop(sample_female): proportion currentetoh, percent cformat(%9.6f)
	svy, subpop(sample_female): proportion current_smoker, percent cformat(%9.6f)
	svy, subpop(sample_female): proportion obesity, percent cformat(%9.6f)
	svy, subpop(sample_female): proportion hypertensive140clinical, percent cformat(%9.6f)
	
/*****************
Part 4: Multimorbidity tables and plot 
Figure 2 and appendix tables 2

See 050622_multimorbidity.R for plot creation
*/


*Appendix Table 2 and Figure 2 data
	*1 comorbidity
		generate onecomo=.
			replace onecomo=1 if (diabetes==1|hypertensive140clinical==1|obesity==1|current_smoker==1|currentetoh==3) // At least one comorbidity
			replace onecomo=0 if onecomo!=1 // 
			replace onecomo=. if (diabetes==.|hypertensive140clinical==.|obesity==.|current_smoker==.|currentetoh==.)
			table onecomo
		label define one_como_label 0 "no comorbidities" 1 "At least one comorbidity"
		label values onecomo one_como_label	
		
	*2 comorbidities
		generate twocomo=.
			replace twocomo=1 if (dm_htn==1 | dm_obesity==1 | dm_smoking==1 | htn_obesity==1 | htn_smoking==1 | obesity_smoking==1 | etoh_dm==1 | etoh_htn==1 | etoh_obesity==1 | 				etoh_smoking==1) //At least two comorbidities
			replace twocomo=0 if twocomo!=1
			replace twocomo=. if (dm_htn==. | dm_obesity==. | dm_smoking==. | htn_obesity==. | htn_smoking==. | obesity_smoking==. |etoh_dm==. | etoh_htn==. | etoh_obesity==. | 				etoh_smoking==.)
			table twocomo
		label define twocomo_label 0 "less than 2 comorbidities" 1 "at least two comorbidities"
		label values twocomo twocomo_label
		
	*3 comorbidities
		generate threecomo=.
			replace threecomo=1 if t2dm_htn_obesity==1 | t2dm_htn_smok==1 | t2dm_obesity_smok==1 | htn_obes_smok==1 | etoh_t2dm_htn==1 | etoh_t2dm_obesity==1 | etoh_t2dm_smok==1 | 			etoh_htn_smok==1 | etoh_htn_obesity==1 | etoh_obesity_smok==1
			replace threecomo=0 if threecomo != 1
			replace threecomo=. if (t2dm_htn_obesity==. | t2dm_htn_smok==. | t2dm_obesity_smok==. | htn_obes_smok==. | etoh_t2dm_htn==. | etoh_htn_smok==. | etoh_obesity_smok==.) //note that the variables with zero observations are eliminated here			
			lab var threecomo "three or more comorbidities"
			table threecomo
	
	*4 comorbidites
	gen fourcomo=.
		replace fourcomo=1 if (t2dm_htn_obesity_smok==1 | t2dm_htn_obesity_etoh==1 | t2dm_htn_etoh_smok==1 | t2dm_etoh_obesity_smok==1 | htn_etoh_obesity_smok==1)
		replace fourcomo=0 if fourcomo!=1
		replace fourcomo=. if (t2dm_htn_obesity_smok==. | t2dm_htn_obesity_etoh==. | t2dm_htn_etoh_smok==. | htn_etoh_obesity_smok==.)
		lab var fourcomo "four or more comorbidities"
		table fourcomo
			
	*Prevalence of comorbidities by age category
	svy: proportion onecomo, over(agecat) percent cformat(%9.6f)
	svy: proportion twocomo, over(agecat) percent cformat(%9.6f)
	svy: proportion threecomo, over(agecat) percent cformat(%9.6f)
	svy: proportion fourcomo, over(agecat) percent cformat(%9.6f)
	
	*Calculate proportion by age category for each individual CVD risk factor
	svy: proportion diabetes, over(agecat) percent cformat(%9.1f)
	svy: proportion hypertensive140clinical, over(agecat) percent cformat(%9.1f)
	svy: proportion obesity, over(agecat) percent cformat(%9.1f)
	svy: proportion current_smoker, over(agecat) percent cformat(%9.1f)

	*Compare current_smoker to current or former smoker
		gen anytobacco_use=.
			replace anytobacco_use=2 if fumar<=2|fumarantes<=2
			replace anytobacco_use=1 if fumar==3&fumarantes==3
			replace anytobacco_use=. if fumar==.
		label define tobacco_label2 2 "any lifetime tobacco use" 1 "never smoker"
		label values anytobacco_use tobacco_label2
		
		svy: proportion anytobacco_use, over(agecat) percent
		
/***************
Part 5:
Table 2 and appendices table S3 and S4- logistic regression
*/

*Prepare exposure variables for regression**


	*edadentr - use a square term for continuous age, then generate average marginal effects.
	/*edadentr (change from continuous to categorical).
		*6 categories
		gen agecat=.
		replace agecat=0 if edadentr>=18 & edadentr<30
		replace agecat= 1 if edadentr>=30 & edadentr<40
		replace agecat= 2 if edadentr>=40 & edadentr<50
		replace agecat= 3 if edadentr>=50 & edadentr<60
		replace agecat= 4 if edadentr>=60 & edadentr<70
		replace agecat= 5 if edadentr>=70
		replace agecat=. if edadentr==.
		label define agecat_label 0 "18-29" 1 "30-39" 2 "40-49" 3 "50-59" 4 "60-69" 5 "70+"
		label values agecat agecat_label
		tab agecat
		
		4 categories (18-34), (35-49), (50-64), (65+)
		gen agecat4 = .
		replace agecat4=0 if edadentr>=18 & edadentr < 35
		replace agecat4=1 if edadentr>=35 & edadentr < 50
		replace agecat4=2 if edadentr>=50 & edadentr < 65
		replace agecat4=3 if edadentr>=65
		replace agecat4=. if edadentr==.
		label define agecat4_label 0 "18-34" 1 "35-49" 2 "50-64" 3 "65+"
		label values agecat4 agecat4_label
		tab agecat4*/
	
	
	* sexo (categorical)
	
	* Education (3 levels categorical)

		*use schoolcat
		
	* Race/ethnicity (categorical)
		*use ethn
			
	* QPS (continuous)
	
		*(note - I'm not sure how Ann calculated this, because none of these qps variables correspond to the value reported in the CKDu paper)
		*qps_prob_pobr_100 - probability of being below the national poverty line
		*qps_prob_comida - probability of being below the severe food poverty line
		
		svy: tabulate qps_prob_pobr_100
		svy: tabulate rawqpsscore
		
		*Generate a new variable that categorizes people into 0%, 50%, and 100% of being at national poverty line

	
	* Obesity
		*PURE defintion: "Waist and hip circumference was measured routinely in participants at baseline, and used to calculate the waist to hip ratio (WHR)"
		*Obesity: "WHR > 0.9 in men or 0.85 in women"
		*Non-obses: "WHR < 0.9 in men or 0.85 in women"
		
		*Our definition: BMI>=30
		
		*use obesity (defined above)
	
		svy: proportion obesity
		
	* HTN (140/90)
		*PURE definition: >= 140mmHg, self-report of HTN, or Tx with anti-hypertensive medications
		*Our definition: self-report of HTN or >=140mmHg
			*I'm following the convention defined in NHANES, which eliminates sist1 and diast1 to get more accurate measurements 
			* We therefore will NOT be using the sistfinal and diastfinal variables that Ann created, because these didn't follow this convention
			* For more info, see https://www.cdc.gov/nchs/data/nhanes/nhanes_11_12/Physician_Exam_Manual.pdf and https://jamanetwork.com/journals/jama/article-abstract/185953
				*Note: all SBP readings less than 70 mm Hg and DBP < 40 and DBP >120 have already been deleted due to high likelihood of spurious results.
		
			*use hypertensive140clinical, defined above
	

		
	* Diabetes
		*PURE definition: "Diabetes was defined as either a fasting glucose > 7 mmol/dl or
			*self-reported history of diabetes, on treatment for diabetes."
		
		*Our definition: Diabetes was defined as either a glycosylate hemoglobin >= 6.5% or a self-reported history of diabetes
			
		*Creating the variable	
			*the variable defined above is already in our dataset, and is called "diabetes"
		svy: proportion diabetes	
	
/*Generate average marginal effects on the continuous variables
			*first, turn continuous variables into square terms.
			*Install spost13_ado to run the mlincom command
				net describe spost13_ado, from (https://jslsoc.sitehost.iu.edu/stata)
				net install spost13_ado*/

**Calculations for Table 2**				
				
* Obesity: Age- and sex-adjusted predicted probabilities and average marginal effects			
			
	* Age (sex-adjusted)
	svy: logit obesity c.edadentr##c.edadentr i.sexo, baselevels or cformat(%9.2f)
	margins, at(edadentr=(25(10)75)) post cformat(%9.3f)
	mlincom (2-1)
	mlincom (3-1)
	mlincom (4-1)
	mlincom (5-1)
	mlincom(6-1)

	* Sex (age-adjusted)
	svy: logit obesity c.edadentr##c.edadentr i.sexo, baselevels or cformat(%9.2f)
	margins i.sexo, cformat(%9.3f)
	margins, dydx(sex) cformat(%9.3f)
	
	svy: logit obesity c.edadentr##c.edadentr i.sexo, baselevels or cformat(%9.2f)
	margins i.sexo, cformat(%9.3f)
	margins, dydx(sex) cformat(%9.3f)

	* Education (age- and sex-adjusted)
	svy: logit obesity c.edadentr##c.edadentr i.sexo ib3.schoolcat, baselevels or cformat(%9.2f)
	margins ib3.schoolcat, cformat(%9.3f)
	margins, dydx(ib3.schoolcat) cformat(%9.3f)
	
	* Ethnicity (age- and sex-adjusted)
	svy: logit obesity c.edadentr##c.edadentr i.sexo ib2.ethn, baselevels or cformat(%9.2f)
	margins ib2.ethn, cformat(%9.3f)
	margins, dydx(ib2.ethn) cformat(%9.3f)

	* Poverty (age- and sex-adjusted)
	svy: logit obesity c.edadentr##c.edadentr i.sexo c.qps_prob_pobr_100##c.qps_prob_pobr_100, baselevels or cformat(%9.2f)
	margins, at(qps_prob_pobr_100=(0 50 100)) post cformat(%9.3f)
	mlincom (2-1)
	mlincom (3-1)

* Diabetes: Age- and sex-adjusted predicted probabilities and average marginal effects			
			
	* Age (sex-adjusted)
	svy: logit diabetes c.edadentr##c.edadentr i.sexo, baselevels or cformat(%9.2f)
	margins, at(edadentr=(25(10)75)) post cformat(%9.3f)
	mlincom (2-1)
	mlincom (3-1)
	mlincom (4-1)
	mlincom (5-1)
	mlincom(6-1)

	* Sex (age-adjusted)
	svy: logit diabetes c.edadentr##c.edadentr i.sexo, baselevels or cformat(%9.2f)
	margins i.sexo, cformat(%9.3f)
	margins, dydx(sex) cformat(%9.3f)

	* Education (age- and sex-adjusted)
	svy: logit diabetes c.edadentr##c.edadentr i.sexo ib3.schoolcat, baselevels or cformat(%9.2f)
	margins ib3.schoolcat, cformat(%9.3f)
	margins, dydx(ib3.schoolcat) cformat(%9.3f)
	
	* Ethnicity (age- and sex-adjusted)
	svy: logit diabetes c.edadentr##c.edadentr i.sexo ib2.ethn, baselevels or cformat(%9.2f)
	margins ib2.ethn, cformat(%9.3f)
	margins, dydx(ib2.ethn) cformat(%9.3f)

	* Poverty (age- and sex-adjusted)
	svy: logit diabetes c.edadentr##c.edadentr i.sexo c.qps_prob_pobr_100##c.qps_prob_pobr_100, baselevels or cformat(%9.2f)
	margins, at(qps_prob_pobr_100=(0 50 100)) post cformat(%9.3f)
	mlincom (2-1)
	mlincom (3-1)

* Hypertension: Age- and sex-adjusted predicted probabilities and average marginal effects			
			
	* Age (sex-adjusted)
	svy: logit hypertensive140clinical c.edadentr##c.edadentr i.sexo, baselevels or cformat(%9.2f)
	margins, at(edadentr=(25(10)75)) post cformat(%9.3f)
	mlincom (2-1)
	mlincom (3-1)
	mlincom (4-1)
	mlincom (5-1)
	mlincom(6-1)

	* Sex (age-adjusted)
	svy: logit hypertensive140clinical c.edadentr##c.edadentr i.sexo, baselevels or cformat(%9.2f)
	margins i.sexo, cformat(%9.3f)
	margins, dydx(sex) cformat(%9.3f)

	* Education (age- and sex-adjusted)
	svy: logit hypertensive140clinical c.edadentr##c.edadentr i.sexo ib3.schoolcat, baselevels or cformat(%9.2f)
	margins ib3.schoolcat, cformat(%9.3f)
	margins, dydx(ib3.schoolcat) cformat(%9.3f)
	
	* Ethnicity (age- and sex-adjusted)
	svy: logit hypertensive140clinical c.edadentr##c.edadentr i.sexo ib2.ethn, baselevels or cformat(%9.2f)
	margins ib2.ethn, cformat(%9.3f)
	margins, dydx(ib2.ethn) cformat(%9.3f)

	* Poverty (age- and sex-adjusted)
	svy: logit hypertensive140clinical c.edadentr##c.edadentr i.sexo c.qps_prob_pobr_100##c.qps_prob_pobr_100, baselevels or cformat(%9.2f)
	margins, at(qps_prob_pobr_100=(0 50 100)) post cformat(%9.3f)
	mlincom (2-1)
	mlincom (3-1)

**Logistic regression**
	**Outputs for supplementary tables S3 and S4
	
*outcome = obesity
			*age margins
			svy: logit obesity c.edadentr##c.edadentr i.sexo ib3..schoolcat c.qps_prob_pobr_100##c.qps_prob_pobr_100 ib2.ethn, baselevels or cformat(%9.2f)
			margins, at(edadentr=(25(10)75)) post cformat(%9.3f)
			mlincom (2-1)
			mlincom (3-1)
			mlincom (4-1)
			mlincom (5-1)
			mlincom(6-1)
			
			*poverty margins
			svy: logit obesity c.edadentr##c.edadentr i.sexo ib3..schoolcat c.qps_prob_pobr_100##c.qps_prob_pobr_100 ib2.ethn, baselevels or cformat(%9.2f)
			margins, at(qps_prob_pobr_100=(0 100)) post cformat(%9.3f)
			mlincom (2-1)
			
			*categorical predicted probabilities
			svy: logit obesity c.edadentr##c.edadentr i.sexo ib3..schoolcat c.qps_prob_pobr_100##c.qps_prob_pobr_100 ib2.ethn, baselevels or cformat(%9.2f)
			margins  i.sexo ib3.schoolcat ib2.ethn, vce(unconditional) post baselevels cformat(%9.3f)
			
			*categorical marginal probabilities
			svy: logit obesity c.edadentr##c.edadentr i.sexo ib3..schoolcat c.qps_prob_pobr_100##c.qps_prob_pobr_100 ib2.ethn, baselevels or cformat(%9.2f)
			margins, dydx(i.sexo ib3.schoolcat ib2.ethn) vce(unconditional) post baselevels  cformat(%9.3f)


*outcome = diabetes
			*age margins
			svy: logit diabetes c.edadentr##c.edadentr i.sexo ib3.schoolcat c.qps_prob_pobr_100##c.qps_prob_pobr_100 ib2.ethn, baselevels or cformat(%9.2f)
			margins, at(edadentr=(25(10)75)) post cformat(%9.3f)
			mlincom (2-1)
			mlincom (3-1)
			mlincom (4-1)
			mlincom (5-1)
			mlincom(6-1)
			
			*poverty margins
			svy: logit diabetes c.edadentr##c.edadentr i.sexo ib3.schoolcat c.qps_prob_pobr_100##c.qps_prob_pobr_100 ib2.ethn, baselevels or cformat(%9.2f)
			margins, at(qps_prob_pobr_100=(0 100)) post cformat(%9.3f)
			mlincom (2-1) 
			
			*categorical predicted probabilities
			svy: logit diabetes c.edadentr##c.edadentr i.sexo ib3.schoolcat c.qps_prob_pobr_100##c.qps_prob_pobr_100 ib2.ethn, baselevels or cformat(%9.2f)
			margins  i.sexo ib3.schoolcat ib2.ethn, vce(unconditional) post baselevels cformat(%9.3f)
			*categorical marginal probabilities
			svy: logit diabetes c.edadentr##c.edadentr i.sexo ib3.schoolcat c.qps_prob_pobr_100##c.qps_prob_pobr_100 ib2.ethn, baselevels or cformat(%9.2f)
			margins, dydx(i.sexo ib3.schoolcat ib2.ethn) vce(unconditional) post baselevels  cformat(%9.3f)
			
*outcome = hypertension (140/90)
			*age margins
			svy: logit hypertensive140clinical c.edadentr##c.edadentr i.sexo ib3.schoolcat c.qps_prob_pobr_100##c.qps_prob_pobr_100 ib2.ethn, baselevels or cformat(%9.2f)
			margins, at(edadentr=(25(10)75)) post cformat(%9.3f)
			mlincom (2-1)
			mlincom (3-1)
			mlincom (4-1)
			mlincom (5-1)
			mlincom(6-1)
			
			*poverty margins
			svy: logit hypertensive140clinical c.edadentr##c.edadentr i.sexo ib3.schoolcat c.qps_prob_pobr_100##c.qps_prob_pobr_100 ib2.ethn, baselevels or cformat(%9.2f)
			margins, at(qps_prob_pobr_100=(0 100)) post cformat(%9.3f)
			mlincom (2-1)
			
			*categorical predicted probabilities
			svy: logit hypertensive140clinical c.edadentr##c.edadentr i.sexo ib3.schoolcat c.qps_prob_pobr_100##c.qps_prob_pobr_100 ib2.ethn, baselevels or cformat(%9.2f)
			margins  i.sexo ib3.schoolcat ib2.ethn, vce(unconditional) post baselevels cformat(%9.3f)
			*categorical marginal probabilities
			svy: logit hypertensive140clinical c.edadentr##c.edadentr i.sexo ib3.schoolcat c.qps_prob_pobr_100##c.qps_prob_pobr_100 ib2.ethn, baselevels or cformat(%9.2f)
			margins, dydx(i.sexo ib3.schoolcat ib2.ethn) vce(unconditional) post baselevels cformat(%9.3f)		
